Customer Churn Analysis
# Data Understanding and Exploration:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.shape
(7043, 21)
df.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
df.columns.values
array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype=object)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
df.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
df.describe()
SeniorCitizen | tenure | MonthlyCharges | |
---|---|---|---|
count | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 0.162147 | 32.371149 | 64.761692 |
std | 0.368612 | 24.559481 | 30.090047 |
min | 0.000000 | 0.000000 | 18.250000 |
25% | 0.000000 | 9.000000 | 35.500000 |
50% | 0.000000 | 29.000000 | 70.350000 |
75% | 0.000000 | 55.000000 | 89.850000 |
max | 1.000000 | 72.000000 | 118.750000 |
cda = df.copy()
cda.TotalCharges = pd.to_numeric(cda.TotalCharges, errors = 'coerce')
cda.describe()
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
---|---|---|---|---|
count | 7043.000000 | 7043.000000 | 7043.000000 | 7032.000000 |
mean | 0.162147 | 32.371149 | 64.761692 | 2283.300441 |
std | 0.368612 | 24.559481 | 30.090047 | 2266.771362 |
min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
25% | 0.000000 | 9.000000 | 35.500000 | 401.450000 |
50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
75% | 0.000000 | 55.000000 | 89.850000 | 3794.737500 |
max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
cda.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
cda.loc[cda['TotalCharges'].isnull()== True]
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
488 | 4472-LVYGI | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | NaN | No |
753 | 3115-CZMZD | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | NaN | No |
936 | 5709-LVOEQ | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | NaN | No |
1082 | 4367-NUYAO | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | NaN | No |
1340 | 1371-DWPAZ | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | NaN | No |
3331 | 7644-OMVMY | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | NaN | No |
3826 | 3213-VVOLG | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | NaN | No |
4380 | 2520-SGTTA | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | NaN | No |
5218 | 2923-ARZLG | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | NaN | No |
6670 | 4075-WKNIU | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | NaN | No |
6754 | 2775-SEFEE | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | NaN | No |
11 rows × 21 columns
cda['MonthlyCharges'].corr(cda['TotalCharges'])
0.6510648032262024
cda[cda['MonthlyCharges'].between(19, 25)].median()
SeniorCitizen 0.000 tenure 22.000 MonthlyCharges 20.050 TotalCharges 460.225 dtype: float64
cda[cda['MonthlyCharges'].between(50, 80)].median()
SeniorCitizen 0.00 tenure 22.00 MonthlyCharges 69.20 TotalCharges 1395.05 dtype: float64
cda['TotalCharges'] = np.where((cda['TotalCharges'].isnull()) & (cda['MonthlyCharges'].between(19, 26)), 460.225, cda['TotalCharges'])
# cda.loc[cda['TotalCharges'].isnull()== True]
cda['TotalCharges'] = np.where((cda['TotalCharges'].isnull()) & (cda['MonthlyCharges'].between(52, 81)), 1395.05, cda['TotalCharges'])
cda.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
cda.describe()
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
---|---|---|---|---|
count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 0.162147 | 32.371149 | 64.761692 | 2281.116754 |
std | 0.368612 | 24.559481 | 30.090047 | 2265.747871 |
min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
25% | 0.000000 | 9.000000 | 35.500000 | 402.225000 |
50% | 0.000000 | 29.000000 | 70.350000 | 1395.050000 |
75% | 0.000000 | 55.000000 | 89.850000 | 3786.600000 |
max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
cda.duplicated().sum()
0
cda['tenure'].min()
0
cda['tenure'].max()
72
max_tenure = 72
bins = range(0, max_tenure + 13, 12) # Increment the upper limit of the range by 12
labels = ["{0} - {1}".format(i, min(i + 11, max_tenure)) for i in bins[:-1]]
cda['tenure_group'] = pd.cut(cda['tenure'], bins=bins, labels=labels, right=False)
cda['tenure_group'].value_counts()
0 - 11 2069 60 - 71 1121 12 - 23 1047 24 - 35 876 48 - 59 820 36 - 47 748 72 - 72 362 Name: tenure_group, dtype: int64
cda.drop(['customerID'], axis=1, inplace=True)
cda.head()
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | ... | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | tenure_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | ... | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No | 0 - 11 |
1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | ... | No | No | No | One year | No | Mailed check | 56.95 | 1889.50 | No | 24 - 35 |
2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | ... | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes | 0 - 11 |
3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | ... | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No | 36 - 47 |
4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | ... | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes | 0 - 11 |
5 rows × 21 columns
cda['Churn'].value_counts()
No 5174 Yes 1869 Name: Churn, dtype: int64
cda['Churn'].value_counts(normalize=True)
No 0.73463 Yes 0.26537 Name: Churn, dtype: float64
# EDA:
def plot_show(data, x_column, hue_column, palette=None, figsize=(6, 4)):
plt.figure(figsize=figsize)
sns.countplot(x=x_column, hue=hue_column, data=data, palette={'No': '#1984c5', 'Yes': '#a7d5ed'})
x_label = x_column
y_label = 'Count'
plt.xlabel(x_label)
plt.ylabel(y_label)
# Add annotations for 'No' and 'Yes' counts in 'Churn' column
ax = plt.gca() # Get the current axes
for p in ax.patches:
if p.get_height() > 0: # Only add text for non-zero counts
ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='bottom')
plt.show()
plot_show(cda, 'gender', 'Churn')
def calculate_count_and_percentage(data, column_name):
# Calculate count and percentage for each category in the specified column
count_percentage= data[column_name].value_counts(dropna=False).reset_index()
count_percentage.columns = [column_name, 'Count']
# Check if the column contains numeric values (integer or float)
if pd.api.types.is_numeric_dtype(data[column_name]):
count_percentage['Percentage'] = (count_percentage['Count'] / count_percentage['Count'].sum()) * 100
else:
# For non-numeric values, calculate percentage using total non-null values
total_non_null = data[column_name].count()
count_percentage['Percentage'] = (count_percentage['Count'] / total_non_null) * 100
# Sort the DataFrame by the column values
count_percentage.sort_values(by=column_name, inplace=True)
return count_percentage
calculate_count_and_percentage(cda, 'gender')
gender | Count | Percentage | |
---|---|---|---|
1 | Female | 3488 | 49.52435 |
0 | Male | 3555 | 50.47565 |
def calculate_churn_ratio(cda, column_name, target_column='Churn'):
# Grouping the data by the specified column and 'Churn' columns and calculating the counts
churn_counts = cda.groupby([column_name, target_column]).size().unstack()
# Calculating the total counts for each group in the specified column (Yes and No)
total_counts = cda[column_name].value_counts()
# Calculate the churn ratio (percentage) for 'Yes' and 'No' in the specified column
churn_ratio = (churn_counts.T / total_counts) * 100
# Transpose churn_ratio for better visual
churn_ratio = churn_ratio.T
return churn_ratio
calculate_churn_ratio(cda, 'gender')
Churn | No | Yes |
---|---|---|
Female | 73.079128 | 26.920872 |
Male | 73.839662 | 26.160338 |
2. How does the churn vary between senior citizens and non-senior citizens?
plot_show(cda, 'SeniorCitizen', 'Churn')
calculate_count_and_percentage(cda, 'SeniorCitizen')
SeniorCitizen | Count | Percentage | |
---|---|---|---|
0 | 0 | 5901 | 83.785319 |
1 | 1 | 1142 | 16.214681 |
calculate_churn_ratio(cda, 'SeniorCitizen')
Churn | No | Yes |
---|---|---|
SeniorCitizen | ||
0 | 76.393832 | 23.606168 |
1 | 58.318739 | 41.681261 |
3. What is the churn distribution for different Internet Service types?
plot_show(cda, 'InternetService', 'Churn')
calculate_count_and_percentage(cda, 'InternetService')
InternetService | Count | Percentage | |
---|---|---|---|
1 | DSL | 2421 | 34.374556 |
0 | Fiber optic | 3096 | 43.958540 |
2 | No | 1526 | 21.666903 |
calculate_churn_ratio(cda, 'InternetService')
Churn | No | Yes |
---|---|---|
DSL | 81.040892 | 18.959108 |
Fiber optic | 58.107235 | 41.892765 |
No | 92.595020 | 7.404980 |
4. How does the churn rate vary across different tenure groups (bins of 12 months)?
plot_show(cda, 'tenure_group', 'Churn', figsize=(10, 5))
calculate_count_and_percentage(cda, 'tenure_group')
tenure_group | Count | Percentage | |
---|---|---|---|
0 | 0 - 11 | 2069 | 29.376686 |
2 | 12 - 23 | 1047 | 14.865824 |
3 | 24 - 35 | 876 | 12.437882 |
5 | 36 - 47 | 748 | 10.620474 |
4 | 48 - 59 | 820 | 11.642766 |
1 | 60 - 71 | 1121 | 15.916513 |
6 | 72 - 72 | 362 | 5.139855 |
calculate_churn_ratio(cda, 'tenure_group')
Churn | No | Yes |
---|---|---|
0 - 11 | 51.715805 | 48.284195 |
12 - 23 | 70.487106 | 29.512894 |
24 - 35 | 77.968037 | 22.031963 |
36 - 47 | 80.481283 | 19.518717 |
48 - 59 | 85.000000 | 15.000000 |
60 - 71 | 91.703836 | 8.296164 |
72 - 72 | 98.342541 | 1.657459 |
missing_values = set(cda['tenure_group'].unique()) - set(cda['tenure_group'].dropna().unique()) print("Missing values in 'tenure_group':") print(missing_values)
cda[cda['tenure_group'].isnull()]
df.head(489)
df[df['tenure'] == 0]
5. What is the churn distribution for different payment methods?
plot_show(cda, 'PaymentMethod', 'Churn', figsize=(10, 5))
calculate_count_and_percentage(cda, 'PaymentMethod')
PaymentMethod | Count | Percentage | |
---|---|---|---|
2 | Bank transfer (automatic) | 1544 | 21.922476 |
3 | Credit card (automatic) | 1522 | 21.610109 |
0 | Electronic check | 2365 | 33.579441 |
1 | Mailed check | 1612 | 22.887974 |
calculate_churn_ratio(cda, 'PaymentMethod')
Churn | No | Yes |
---|---|---|
Bank transfer (automatic) | 83.290155 | 16.709845 |
Credit card (automatic) | 84.756899 | 15.243101 |
Electronic check | 54.714588 | 45.285412 |
Mailed check | 80.893300 | 19.106700 |
6. How does having dependents affect the likelihood of churn?
plot_show(cda, 'Dependents', 'Churn')
calculate_count_and_percentage(cda, 'Dependents')
Dependents | Count | Percentage | |
---|---|---|---|
0 | No | 4933 | 70.041176 |
1 | Yes | 2110 | 29.958824 |
calculate_churn_ratio(cda, 'Dependents')
Churn | No | Yes |
---|---|---|
Dependents | ||
No | 68.720860 | 31.279140 |
Yes | 84.549763 | 15.450237 |
7. Does having a partner influence the churn rate?
plot_show(cda, 'Partner', 'Churn')
calculate_count_and_percentage(cda, 'Partner')
Partner | Count | Percentage | |
---|---|---|---|
0 | No | 3641 | 51.69672 |
1 | Yes | 3402 | 48.30328 |
calculate_churn_ratio(cda, 'Partner')
Churn | No | Yes |
---|---|---|
Partner | ||
No | 67.042021 | 32.957979 |
Yes | 80.335097 | 19.664903 |
8. Does having online backup influence the likelihood of churn?
plot_show(cda, 'OnlineBackup', 'Churn')
calculate_count_and_percentage(cda, 'OnlineBackup')
OnlineBackup | Count | Percentage | |
---|---|---|---|
0 | No | 3088 | 43.844952 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2429 | 34.488144 |
calculate_churn_ratio(cda, 'OnlineBackup')
Churn | No | Yes |
---|---|---|
No | 60.071244 | 39.928756 |
No internet service | 92.595020 | 7.404980 |
Yes | 78.468506 | 21.531494 |
9. What is the churn distribution for different types of contracts (Month-to-month, One-year, Two-year)?
plot_show(cda, 'Contract', 'Churn')
calculate_count_and_percentage(cda, 'Contract')
Contract | Count | Percentage | |
---|---|---|---|
0 | Month-to-month | 3875 | 55.019168 |
2 | One year | 1473 | 20.914383 |
1 | Two year | 1695 | 24.066449 |
calculate_churn_ratio(cda, 'Contract')
Churn | No | Yes |
---|---|---|
Month-to-month | 57.290323 | 42.709677 |
One year | 88.730482 | 11.269518 |
Two year | 97.168142 | 2.831858 |
10. What is the churn distribution based on whether the customer has phone service or not?
plot_show(cda, 'PhoneService', 'Churn')
calculate_count_and_percentage(cda, 'PhoneService')
PhoneService | Count | Percentage | |
---|---|---|---|
1 | No | 682 | 9.683374 |
0 | Yes | 6361 | 90.316626 |
calculate_churn_ratio(cda, 'PhoneService')
Churn | No | Yes |
---|---|---|
No | 75.073314 | 24.926686 |
Yes | 73.290363 | 26.709637 |
11. Does having multiple phone lines influence the churn rate?
plot_show(cda, 'MultipleLines', 'Churn')
calculate_count_and_percentage(cda, 'MultipleLines')
MultipleLines | Count | Percentage | |
---|---|---|---|
0 | No | 3390 | 48.132898 |
2 | No phone service | 682 | 9.683374 |
1 | Yes | 2971 | 42.183729 |
calculate_churn_ratio(cda, 'MultipleLines')
Churn | No | Yes |
---|---|---|
No | 74.955752 | 25.044248 |
No phone service | 75.073314 | 24.926686 |
Yes | 71.390104 | 28.609896 |
12. What is the churn distribution for customers with or without online security?
plot_show(cda, 'OnlineSecurity', 'Churn')
calculate_count_and_percentage(cda, 'OnlineSecurity')
OnlineSecurity | Count | Percentage | |
---|---|---|---|
0 | No | 3498 | 49.666335 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2019 | 28.666761 |
calculate_churn_ratio(cda, 'OnlineSecurity')
Churn | No | Yes |
---|---|---|
No | 58.233276 | 41.766724 |
No internet service | 92.595020 | 7.404980 |
Yes | 85.388806 | 14.611194 |
13. What is the churn distribution for customers with or without tech support?
plot_show(cda, 'TechSupport', 'Churn')
calculate_count_and_percentage(cda, 'TechSupport')
TechSupport | Count | Percentage | |
---|---|---|---|
0 | No | 3473 | 49.311373 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2044 | 29.021724 |
calculate_churn_ratio(cda, 'TechSupport')
Churn | No | Yes |
---|---|---|
No | 58.364526 | 41.635474 |
No internet service | 92.595020 | 7.404980 |
Yes | 84.833659 | 15.166341 |
14. How does the availability of device protection affect churn?
plot_show(cda, 'DeviceProtection', 'Churn')
calculate_count_and_percentage(cda, 'DeviceProtection')
DeviceProtection | Count | Percentage | |
---|---|---|---|
0 | No | 3095 | 43.944342 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2422 | 34.388755 |
calculate_churn_ratio(cda, 'DeviceProtection')
Churn | No | Yes |
---|---|---|
No | 60.872375 | 39.127625 |
No internet service | 92.595020 | 7.404980 |
Yes | 77.497936 | 22.502064 |
15. Does streaming TV services impact the churn rate?
plot_show(cda, 'StreamingTV', 'Churn')
calculate_count_and_percentage(cda, 'StreamingTV')
StreamingTV | Count | Percentage | |
---|---|---|---|
0 | No | 2810 | 39.897771 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2707 | 38.435326 |
calculate_churn_ratio(cda, 'StreamingTV')
Churn | No | Yes |
---|---|---|
No | 66.476868 | 33.523132 |
No internet service | 92.595020 | 7.404980 |
Yes | 69.929812 | 30.070188 |
16. How does streaming movie services affect the likelihood of churn?
plot_show(cda, 'StreamingMovies', 'Churn')
calculate_count_and_percentage(cda, 'StreamingMovies')
StreamingMovies | Count | Percentage | |
---|---|---|---|
0 | No | 2785 | 39.542808 |
2 | No internet service | 1526 | 21.666903 |
1 | Yes | 2732 | 38.790288 |
calculate_churn_ratio(cda, 'StreamingMovies')
Churn | No | Yes |
---|---|---|
No | 66.319569 | 33.680431 |
No internet service | 92.595020 | 7.404980 |
Yes | 70.058565 | 29.941435 |
17. Does paperless billing influence the churn rate?
plot_show(cda, 'PaperlessBilling', 'Churn')
calculate_count_and_percentage(cda, 'PaperlessBilling')
PaperlessBilling | Count | Percentage | |
---|---|---|---|
1 | No | 2872 | 40.778078 |
0 | Yes | 4171 | 59.221922 |
calculate_churn_ratio(cda, 'PaperlessBilling')
Churn | No | Yes |
---|---|---|
No | 83.669916 | 16.330084 |
Yes | 66.434908 | 33.565092 |
18.Relationship between Monthly Charges and Total Charges
plt.figure(figsize=(10, 5))
plt.scatter(cda['MonthlyCharges'], cda['TotalCharges'], c= '#6F61C0', alpha=0.2)
plt.title('Relationship between Monthly Charges and Total Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Total Charges')
plt.show()
19. Churn rate by Monthly Charges and Total Charges
cmc = sns.FacetGrid(cda, hue="Churn", height=4, aspect=2)
cmc.map(sns.kdeplot, "MonthlyCharges", shade=True)
cmc.add_legend(title="Churn")
cmc.set_xlabels("Monthly Charges")
cmc.fig.suptitle("Churn by Monthly charges ", fontsize=16)
plt.show()
ctc = sns.FacetGrid(cda, hue="Churn", height=4, aspect=2)
ctc.map(sns.kdeplot, "TotalCharges", shade=True)
ctc.add_legend(title="Churn")
ctc.set_xlabels("Total Charges")
ctc.fig.suptitle("Churn by Total charges ", fontsize=16)
plt.show()
# Feature Engineering:
cda['Churn'] = cda['Churn'].map({'Yes':'1', 'No':'0'})
cda['Churn'] = pd.to_numeric(cda['Churn'], errors='coerce')
cda.head()
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | ... | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | tenure_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | ... | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | 0 | 0 - 11 |
1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | ... | No | No | No | One year | No | Mailed check | 56.95 | 1889.50 | 0 | 24 - 35 |
2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | ... | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | 1 | 0 - 11 |
3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | ... | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | 0 | 36 - 47 |
4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | ... | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | 1 | 0 - 11 |
5 rows × 21 columns
target0=cda.loc[cda["Churn"]==0]
target1=cda.loc[cda["Churn"]==1]
def plot(cda, col, hue=None):
sns.set_style('whitegrid')
sns.set_context('talk')
plt.rcParams["axes.labelsize"] = 15
plt.rcParams['axes.titlesize'] = 15
plt.rcParams['axes.titlepad'] = 20
plt.figure(figsize=(10, 4), )
ax = sns.countplot(data=cda, x=col, hue=hue, palette='Blues', order=cda[col].value_counts().index)
plt.xticks(rotation=45)
plt.yscale('log')
ax.yaxis.grid(False)
plt.show()
1. How having a partner affect churn behavior between gender?
plot(target1, col='Partner', hue='gender')
2. How being dependents for gender affected to churn?
plot(target1, col='Dependents', hue='gender')
3. How Tech support for gender affected to churn?
plot(target1, col='TechSupport', hue='gender')
4.How does the churn behavior vary based on the type of internet service and the presence of a partner?
- Plot the 'InternetService' column as the 'hue' and 'Partner' column as the 'col' for churned customers.
plot(target1, col='Partner', hue='InternetService')
5. Is there any difference in churn rates for customers with and without dependents, considering the different payment methods used?
- Plot the 'Dependents' column as the 'hue' and 'PaymentMethod' column as the 'col' for churned customers.
plot(target1, col='PaymentMethod', hue='Dependents')
6. Does the type of internet service have an impact on churn behavior, and does it differ when considering the contract type?
- Plot the 'InternetService' column as the 'hue' and 'Contract' column as the 'col' for churned customers.
plot(target1, col='Contract', hue='InternetService')
7. How does the churn distribution vary based on the payment method and the presence of tech support?
- Plot the 'PaymentMethod' column as the 'hue' and 'TechSupport' column as the 'col' for churned customers.
plot(target1, col='TechSupport', hue='PaymentMethod')
8. Are there any interactions between the contract type and paperless billing affecting churn?
- Plot the 'Contract' column as the 'hue' and 'PaperlessBilling' column as the 'col' for churned customers.
plot(target1, col='PaperlessBilling', hue='Contract')
9. How does churn behavior differ for customers with and without tech support, considering the type of internet service they have?
- Plot the 'TechSupport' column as the 'hue' and 'InternetService' column as the 'col' for churned customers.
plot(target1, col='InternetService', hue='TechSupport')
10. Is there any variation in churn rates based on the internet service and the paperless billing option?
- Plot the 'InternetService' column as the 'hue' and 'PaperlessBilling' column as the 'col' for churned customers.
plot(target1, col='PaperlessBilling', hue='InternetService')
11. Does the presence of phone service influence churn behavior? Compare churn rates for gender with and without phone service.
plot(target1, col='PhoneService', hue='gender')
12. Analyze churn rates for different internet service categories among gender.
plot(target1, col='InternetService', hue='gender')
13. Analyze churn rates for customers with and without tech support for gender?
plot(target1, col='TechSupport', hue='gender')
14. Does the churn distribution vary based on the tenure group and contract type?
plot(target1, col='Contract', hue='tenure_group')
15. Does the churn distribution vary based on the tenure group and gender?
plot(target1, col='tenure_group', hue='gender')
cda.drop(['tenure'], axis=1, inplace=True)
categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group']
dummy_data = pd.get_dummies(cda, columns=categorical_columns, drop_first=True)
dummy_data = pd.get_dummies(cda.drop('Churn', axis=1), drop_first=True)
dummy_data.head()
SeniorCitizen | MonthlyCharges | TotalCharges | Churn | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | ... | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | tenure_group_12 - 23 | tenure_group_24 - 35 | tenure_group_36 - 47 | tenure_group_48 - 59 | tenure_group_60 - 71 | tenure_group_72 - 72 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4 | 0 | 70.70 | 151.65 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 36 columns
Build a corelation of all predictors with 'Churn'
plt.figure(figsize=(12,5))
dummy_data.corr()['Churn'].sort_values(ascending=False).plot(kind='bar', grid=False)
<Axes: >
dummy_data.to_csv('cleandata.csv')